import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
def read_data(path):
    """Load a CSV file into a DataFrame.

    Uses latin-1 decoding and disables low-memory chunked parsing so
    column dtypes are inferred from the whole file at once.
    """
    return pd.read_csv(path, low_memory=False, encoding='latin1')
# Load the three raw datasets and report their dimensions.
airport_path = "Airport_Codes.csv"
Airportcodes_df = read_data(airport_path)
Flights_path = "Flights.csv"
Flights_df = read_data(Flights_path)
Tickets_path = "Tickets.csv"
Tickets_df = read_data(Tickets_path)
for frame in (Airportcodes_df, Flights_df, Tickets_df):
    print(frame.shape)
(55369, 8) (1915886, 16) (1167285, 12)
# Preview the airport-codes table and count missing values per column
print(Airportcodes_df.head())
Airportcodes_df.isnull().sum()
TYPE NAME ELEVATION_FT CONTINENT \ 0 heliport Total Rf Heliport 11.0 NaN 1 small_airport Aero B Ranch Airport 3435.0 NaN 2 small_airport Lowell Field 450.0 NaN 3 small_airport Epps Airpark 820.0 NaN 4 closed Newport Hospital & Clinic Heliport 237.0 NaN ISO_COUNTRY MUNICIPALITY IATA_CODE COORDINATES 0 US Bensalem NaN -74.93360137939453, 40.07080078125 1 US Leoti NaN -101.473911, 38.704022 2 US Anchor Point NaN -151.695999146, 59.94919968 3 US Harvest NaN -86.77030181884766, 34.86479949951172 4 US Newport NaN -91.254898, 35.6087
TYPE 0 NAME 0 ELEVATION_FT 7015 CONTINENT 27843 ISO_COUNTRY 247 MUNICIPALITY 5706 IATA_CODE 46187 COORDINATES 0 dtype: int64
# Preview the flights table and count missing values per column
print(Flights_df. head())
Flights_df.isnull().sum()
FL_DATE OP_CARRIER TAIL_NUM OP_CARRIER_FL_NUM ORIGIN_AIRPORT_ID ORIGIN \ 0 2019-03-02 WN N955WN 4591 14635 RSW 1 2019-03-02 WN N8686A 3231 14635 RSW 2 2019-03-02 WN N201LV 3383 14635 RSW 3 2019-03-02 WN N413WN 5498 14635 RSW 4 2019-03-02 WN N7832A 6933 14635 RSW ORIGIN_CITY_NAME DEST_AIRPORT_ID DESTINATION DEST_CITY_NAME DEP_DELAY \ 0 Fort Myers, FL 11042 CLE Cleveland, OH -8.0 1 Fort Myers, FL 11066 CMH Columbus, OH 1.0 2 Fort Myers, FL 11066 CMH Columbus, OH 0.0 3 Fort Myers, FL 11066 CMH Columbus, OH 11.0 4 Fort Myers, FL 11259 DAL Dallas, TX 0.0 ARR_DELAY CANCELLED AIR_TIME DISTANCE OCCUPANCY_RATE 0 -6.0 0.0 143.0 1025.0 0.97 1 5.0 0.0 135.0 930.0 0.55 2 4.0 0.0 132.0 930.0 0.91 3 14.0 0.0 136.0 930.0 0.67 4 -17.0 0.0 151.0 1005.0 0.62
FL_DATE 0 OP_CARRIER 0 TAIL_NUM 12156 OP_CARRIER_FL_NUM 0 ORIGIN_AIRPORT_ID 0 ORIGIN 0 ORIGIN_CITY_NAME 0 DEST_AIRPORT_ID 0 DESTINATION 0 DEST_CITY_NAME 0 DEP_DELAY 50351 ARR_DELAY 55991 CANCELLED 0 AIR_TIME 56551 DISTANCE 630 OCCUPANCY_RATE 310 dtype: int64
# Preview the tickets table and count missing values per column
print(Tickets_df.head())
Tickets_df.isnull().sum()
ITIN_ID YEAR QUARTER ORIGIN ORIGIN_COUNTRY ORIGIN_STATE_ABR \ 0 201912723049 2019 1 ABI US TX 1 201912723085 2019 1 ABI US TX 2 201912723491 2019 1 ABI US TX 3 201912723428 2019 1 ABI US TX 4 201912723509 2019 1 ABI US TX ORIGIN_STATE_NM ROUNDTRIP REPORTING_CARRIER PASSENGERS ITIN_FARE \ 0 Texas 1.0 MQ 1.0 736.0 1 Texas 1.0 MQ 1.0 570.0 2 Texas 1.0 MQ 1.0 564.0 3 Texas 1.0 MQ 1.0 345.0 4 Texas 0.0 MQ 1.0 309.0 DESTINATION 0 DAB 1 COS 2 MCO 3 LGA 4 MGM
ITIN_ID 0 YEAR 0 QUARTER 0 ORIGIN 0 ORIGIN_COUNTRY 0 ORIGIN_STATE_ABR 0 ORIGIN_STATE_NM 0 ROUNDTRIP 0 REPORTING_CARRIER 0 PASSENGERS 1977 ITIN_FARE 960 DESTINATION 0 dtype: int64
# Function to reduce memory usage by adjusting data types - REUSABLE CODE
def reduce_memory_usage(df):
    """Shrink a DataFrame's memory footprint in place.

    - int64 columns are downcast to int32 only when every value fits the
      int32 range. A blind cast silently overflows: 12-digit ITIN_ID
      values would wrap around into negative numbers.
    - float64 columns are downcast to float32.
    - missing values in object columns are replaced by the string "null".

    Returns the (mutated) DataFrame for convenient chaining.
    """
    int32_limits = np.iinfo(np.int32)
    for column in df.columns:
        if df[column].dtype == 'int64':
            # Only downcast when it is lossless for this column.
            if (len(df) > 0
                    and df[column].min() >= int32_limits.min
                    and df[column].max() <= int32_limits.max):
                df[column] = df[column].astype('int32')
        elif df[column].dtype == 'float64':
            df[column] = df[column].astype('float32')
        elif df[column].dtype == 'object':
            df[column] = df[column].fillna("null")
    return df
# Function to convert specified columns to numeric types - REUSABLE CODE
def convert_to_numeric(df, column_names):
    """Coerce each named column to a numeric dtype.

    Values that cannot be parsed become NaN. Mutates and returns ``df``.
    """
    for name in column_names:
        df[name] = pd.to_numeric(df[name], errors='coerce')
    return df
# Shrink all three frames, then keep only medium and large airports.
Airportcodes_df = reduce_memory_usage(Airportcodes_df)
Flights_df = reduce_memory_usage(Flights_df)
Tickets_df = reduce_memory_usage(Tickets_df)
# Filter rows where TYPE is 'medium_airport' or 'large_airport'
keep_types = ['medium_airport', 'large_airport']
Airportcodes_df = Airportcodes_df[Airportcodes_df['TYPE'].isin(keep_types)]
Airportcodes_df
| TYPE | NAME | ELEVATION_FT | CONTINENT | ISO_COUNTRY | MUNICIPALITY | IATA_CODE | COORDINATES | |
|---|---|---|---|---|---|---|---|---|
| 6194 | medium_airport | Aleknagik / New Airport | 66.0 | null | US | Aleknagik | WKK | -158.617996216, 59.2826004028 |
| 10444 | medium_airport | Honiara International Airport | 28.0 | OC | SB | Honiara | HIR | 160.05499267578, -9.4280004501343 |
| 10449 | medium_airport | Munda Airport | 10.0 | OC | SB | null | MUA | 157.26300048828125, -8.327969551086426 |
| 10471 | medium_airport | Hongyuan Airport | 11600.0 | AS | CN | Aba | AHJ | 102.35224, 32.53154 |
| 10695 | medium_airport | Nauru International Airport | 22.0 | OC | NR | Yaren District | INU | 166.919006, -0.547458 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 55361 | medium_airport | Tonghua Sanyuanpu Airport | 1200.0 | AS | CN | Tonghua | TNH | 125.703333333, 42.2538888889 |
| 55362 | large_airport | Taoxian Airport | 198.0 | AS | CN | Shenyang | SHE | 123.48300170898438, 41.639801025390625 |
| 55363 | medium_airport | Yanji Chaoyangchuan Airport | 624.0 | AS | CN | Yanji | YNJ | 129.451004028, 42.8828010559 |
| 55364 | medium_airport | Yingkou Lanqi Airport | 0.0 | AS | CN | Yingkou | YKH | 122.3586, 40.542524 |
| 55365 | medium_airport | Shenyang Dongta Airport | NaN | AS | CN | Shenyang | null | 123.49600219726562, 41.784400939941406 |
5146 rows × 8 columns
# Keep only rows where ISO_COUNTRY is equal to 'US'
Airportcodes_df = Airportcodes_df[Airportcodes_df['ISO_COUNTRY'] == 'US']
# Reset the index in place so row labels run 0..n-1 again
Airportcodes_df.reset_index(drop=True, inplace=True)
Airportcodes_df
| TYPE | NAME | ELEVATION_FT | CONTINENT | ISO_COUNTRY | MUNICIPALITY | IATA_CODE | COORDINATES | |
|---|---|---|---|---|---|---|---|---|
| 0 | medium_airport | Aleknagik / New Airport | 66.0 | null | US | Aleknagik | WKK | -158.617996216, 59.2826004028 |
| 1 | medium_airport | South Alabama Regional At Bill Benton Field Ai... | 310.0 | null | US | Andalusia/Opp | null | -86.393799, 31.3088 |
| 2 | medium_airport | Lehigh Valley International Airport | 393.0 | null | US | Allentown | ABE | -75.44080352783203, 40.652099609375 |
| 3 | medium_airport | Abilene Regional Airport | 1791.0 | null | US | Abilene | ABI | -99.68190002440001, 32.4113006592 |
| 4 | large_airport | Albuquerque International Sunport | 5355.0 | null | US | Albuquerque | ABQ | -106.609001, 35.040199 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 853 | medium_airport | Jim's Private Airport | 890.0 | null | US | Conyers | null | -84.14440155, 33.59790039 |
| 854 | medium_airport | Yuma Auxiliary AAF #2 | NaN | null | US | null | null | -114.511383, 32.548984 |
| 855 | large_airport | atl | NaN | null | US | null | null | -84.375, 33.137551 |
| 856 | medium_airport | Williston Basin International Airport (U.C.) | 2344.0 | null | US | Williston | null | -103.748797, 48.258387 |
| 857 | medium_airport | 34S Airport | NaN | null | US | null | null | -16.875, 19.145168 |
858 rows × 8 columns
# Sanity check: only 'medium_airport' / 'large_airport' should remain
Airportcodes_df['TYPE'].unique()
array(['medium_airport', 'large_airport'], dtype=object)
Airportcodes_df.isnull().sum()
TYPE 0 NAME 0 ELEVATION_FT 3 CONTINENT 0 ISO_COUNTRY 0 MUNICIPALITY 0 IATA_CODE 0 COORDINATES 0 dtype: int64
# Drop columns not used downstream; only TYPE, NAME and IATA_CODE are
# needed for the later merges
Airportcodes_df=Airportcodes_df.drop(['CONTINENT','ISO_COUNTRY','MUNICIPALITY','COORDINATES','ELEVATION_FT'],axis=1)
Airportcodes_df
| TYPE | NAME | IATA_CODE | |
|---|---|---|---|
| 0 | medium_airport | Aleknagik / New Airport | WKK |
| 1 | medium_airport | South Alabama Regional At Bill Benton Field Ai... | null |
| 2 | medium_airport | Lehigh Valley International Airport | ABE |
| 3 | medium_airport | Abilene Regional Airport | ABI |
| 4 | large_airport | Albuquerque International Sunport | ABQ |
| ... | ... | ... | ... |
| 853 | medium_airport | Jim's Private Airport | null |
| 854 | medium_airport | Yuma Auxiliary AAF #2 | null |
| 855 | large_airport | atl | null |
| 856 | medium_airport | Williston Basin International Airport (U.C.) | null |
| 857 | medium_airport | 34S Airport | null |
858 rows × 3 columns
Airportcodes_df['TYPE'].value_counts()
medium_airport 687 large_airport 171 Name: TYPE, dtype: int64
Flights_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1915886 entries, 0 to 1915885 Data columns (total 16 columns): # Column Dtype --- ------ ----- 0 FL_DATE object 1 OP_CARRIER object 2 TAIL_NUM object 3 OP_CARRIER_FL_NUM object 4 ORIGIN_AIRPORT_ID int32 5 ORIGIN object 6 ORIGIN_CITY_NAME object 7 DEST_AIRPORT_ID int32 8 DESTINATION object 9 DEST_CITY_NAME object 10 DEP_DELAY float32 11 ARR_DELAY float32 12 CANCELLED float32 13 AIR_TIME object 14 DISTANCE object 15 OCCUPANCY_RATE float32 dtypes: float32(4), int32(2), object(10) memory usage: 190.0+ MB
# Keep only flights that actually operated (CANCELLED == 0) and
# renumber the rows from zero.
operated = Flights_df['CANCELLED'] == 0
Flights_df = Flights_df.loc[operated].reset_index(drop=True)
Flights_df['FL_DATE'].unique()  # inspect the mix of date formats
array(['2019-03-02', '2019-03-03', '2019-03-09', '2019-03-10',
'2019-03-24', '2019-03-28', '2019-03-29', '2019-03-30',
'2019-03-20', '2019-03-25', '2019-03-26', '2019-03-31',
'2019-03-13', '2019-03-14', '2019-03-12', '2019-03-11',
'2019-03-15', '2019-03-16', '2019-03-17', '2019-03-18',
'2019-03-19', '2019-03-21', '2019-03-22', '2019-03-23',
'2019-03-27', '2019-03-01', '2019-03-04', '2019-03-05',
'2019-03-06', '2019-03-07', '2019-03-08', '2019-02-17',
'2019-02-18', '2019-02-19', '2019-02-20', '2019-02-21',
'2019-02-22', '2019-02-23', '2019-02-24', '2019-02-26',
'2019-02-27', '2019-02-28', '2019-02-01', '2019-02-02',
'2019-02-03', '2019-02-04', '2019-02-05', '2019-02-06',
'2019-02-07', '2019-02-08', '2019-02-09', '2019-02-10',
'2019-02-11', '2019-02-12', '2019-02-13', '2019-02-14',
'2019-02-15', '2019-02-16', '2019-02-25', '2019-01-21',
'2019-01-24', '2019-01-25', '2019-01-26', '2019-01-27',
'2019-01-28', '2019-01-31', '2019-01-01', '2019-01-02',
'2019-01-03', '2019-01-04', '2019-01-05', '2019-01-06',
'2019-01-07', '2019-01-08', '2019-01-09', '2019-01-10',
'2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14',
'2019-01-15', '2019-01-16', '2019-01-17', '2019-01-18',
'2019-01-19', '2019-01-22', '2019-01-23', '2019-01-29',
'2019-01-30', '2019-01-20', '2/2/19', '3/2/19', '1/2/19',
'1/12/19', '2/23/19', '2/5/19', '1/31/19', '2/21/19', '1/1/19',
'1/23/19', '1/9/19', '2/9/19', '1/8/19', '1/18/19', '3/22/19',
'3/23/19', '3/24/19', '3/26/19', '3/27/19', '3/28/19', '3/29/19',
'3/30/19', '3/31/19', '3/1/19', '3/3/19', '3/4/19', '3/5/19',
'3/6/19', '3/7/19', '3/8/19', '3/9/19', '3/10/19', '3/11/19',
'3/12/19', '3/13/19', '3/14/19', '3/15/19', '3/16/19', '3/17/19',
'3/18/19', '3/19/19', '3/20/19', '3/21/19', '3/25/19'],
dtype=object)
# FL_DATE mixes ISO strings ('2019-03-02') and US strings ('3/2/19').
# Parse BOTH formats from the ORIGINAL string column before overwriting
# it: the previous code re-parsed the already-converted datetime column
# with '%m/%d/%y', so every slash-format date was lost as NaT (the 490
# nulls seen later were exactly these rows).
raw_dates = Flights_df['FL_DATE']
iso_dates = pd.to_datetime(raw_dates, format='%Y-%m-%d', errors='coerce')
us_dates = pd.to_datetime(raw_dates, format='%m/%d/%y', errors='coerce')
Flights_df['FL_DATE'] = iso_dates.combine_first(us_dates)
# Convert all dates to 'YYYY-MM-DD' format as strings
Flights_df['FL_DATE'] = Flights_df['FL_DATE'].dt.strftime('%Y-%m-%d')
Flights_df['FL_DATE'].unique()
array(['2019-03-02', '2019-03-03', '2019-03-09', '2019-03-10',
'2019-03-24', '2019-03-28', '2019-03-29', '2019-03-30',
'2019-03-20', '2019-03-25', '2019-03-26', '2019-03-31',
'2019-03-13', '2019-03-14', '2019-03-12', '2019-03-11',
'2019-03-15', '2019-03-16', '2019-03-17', '2019-03-18',
'2019-03-19', '2019-03-21', '2019-03-22', '2019-03-23',
'2019-03-27', '2019-03-01', '2019-03-04', '2019-03-05',
'2019-03-06', '2019-03-07', '2019-03-08', '2019-02-17',
'2019-02-18', '2019-02-19', '2019-02-20', '2019-02-21',
'2019-02-22', '2019-02-23', '2019-02-24', '2019-02-26',
'2019-02-27', '2019-02-28', '2019-02-01', '2019-02-02',
'2019-02-03', '2019-02-04', '2019-02-05', '2019-02-06',
'2019-02-07', '2019-02-08', '2019-02-09', '2019-02-10',
'2019-02-11', '2019-02-12', '2019-02-13', '2019-02-14',
'2019-02-15', '2019-02-16', '2019-02-25', '2019-01-21',
'2019-01-24', '2019-01-25', '2019-01-26', '2019-01-27',
'2019-01-28', '2019-01-31', '2019-01-01', '2019-01-02',
'2019-01-03', '2019-01-04', '2019-01-05', '2019-01-06',
'2019-01-07', '2019-01-08', '2019-01-09', '2019-01-10',
'2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14',
'2019-01-15', '2019-01-16', '2019-01-17', '2019-01-18',
'2019-01-19', '2019-01-22', '2019-01-23', '2019-01-29',
'2019-01-30', '2019-01-20', nan], dtype=object)
# Cast the string-typed numeric columns, then drop exact duplicate rows.
Flights_df = convert_to_numeric(Flights_df, ["DISTANCE", "AIR_TIME"])
Flights_df.duplicated().sum()
Flights_df = Flights_df.drop_duplicates()
Flights_df
| FL_DATE | OP_CARRIER | TAIL_NUM | OP_CARRIER_FL_NUM | ORIGIN_AIRPORT_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_AIRPORT_ID | DESTINATION | DEST_CITY_NAME | DEP_DELAY | ARR_DELAY | CANCELLED | AIR_TIME | DISTANCE | OCCUPANCY_RATE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-03-02 | WN | N955WN | 4591 | 14635 | RSW | Fort Myers, FL | 11042 | CLE | Cleveland, OH | -8.0 | -6.0 | 0.0 | 143.0 | 1025.0 | 0.970000 |
| 1 | 2019-03-02 | WN | N8686A | 3231 | 14635 | RSW | Fort Myers, FL | 11066 | CMH | Columbus, OH | 1.0 | 5.0 | 0.0 | 135.0 | 930.0 | 0.550000 |
| 2 | 2019-03-02 | WN | N201LV | 3383 | 14635 | RSW | Fort Myers, FL | 11066 | CMH | Columbus, OH | 0.0 | 4.0 | 0.0 | 132.0 | 930.0 | 0.910000 |
| 3 | 2019-03-02 | WN | N413WN | 5498 | 14635 | RSW | Fort Myers, FL | 11066 | CMH | Columbus, OH | 11.0 | 14.0 | 0.0 | 136.0 | 930.0 | 0.670000 |
| 4 | 2019-03-02 | WN | N7832A | 6933 | 14635 | RSW | Fort Myers, FL | 11259 | DAL | Dallas, TX | 0.0 | -17.0 | 0.0 | 151.0 | 1005.0 | 0.620000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1859857 | NaN | AA | N903NN | 1433 | 15370 | TUL | Tulsa, OK | 11057 | CLT | Charlotte, NC | -9.0 | -6.0 | 0.0 | 112.0 | NaN | 0.794885 |
| 1859858 | NaN | AA | N965AN | 1433 | 15370 | TUL | Tulsa, OK | 11057 | CLT | Charlotte, NC | -2.0 | -1.0 | 0.0 | 106.0 | NaN | 0.538399 |
| 1859859 | NaN | AA | N979NN | 1433 | 15370 | TUL | Tulsa, OK | 11057 | CLT | Charlotte, NC | -8.0 | -25.0 | 0.0 | 106.0 | NaN | 0.955579 |
| 1859860 | NaN | AA | N872NN | 1433 | 15370 | TUL | Tulsa, OK | 11057 | CLT | Charlotte, NC | -9.0 | -6.0 | 0.0 | 112.0 | NaN | 0.595344 |
| 1859861 | NaN | AA | N945AN | 1433 | 15370 | TUL | Tulsa, OK | 11057 | CLT | Charlotte, NC | -8.0 | 5.0 | 0.0 | 117.0 | NaN | 0.350192 |
1859862 rows × 16 columns
# null values in each column for Flights_df
Flights_df.isna().sum()
FL_DATE 490 OP_CARRIER 0 TAIL_NUM 0 OP_CARRIER_FL_NUM 0 ORIGIN_AIRPORT_ID 0 ORIGIN 0 ORIGIN_CITY_NAME 0 DEST_AIRPORT_ID 0 DESTINATION 0 DEST_CITY_NAME 0 DEP_DELAY 0 ARR_DELAY 4368 CANCELLED 0 AIR_TIME 4616 DISTANCE 268 OCCUPANCY_RATE 31 dtype: int64
# Drop rows with NaN values in the FL_DATE column
# (these are the rows whose date strings failed to parse above)
Flights_df = Flights_df.dropna(subset=['FL_DATE'])
Flights_df.describe()
| ORIGIN_AIRPORT_ID | DEST_AIRPORT_ID | DEP_DELAY | ARR_DELAY | CANCELLED | AIR_TIME | DISTANCE | OCCUPANCY_RATE | |
|---|---|---|---|---|---|---|---|---|
| count | 1.859372e+06 | 1.859372e+06 | 1.859372e+06 | 1.855005e+06 | 1859372.0 | 1.855005e+06 | 1.859372e+06 | 1.859372e+06 |
| mean | 1.268605e+04 | 1.268609e+04 | 1.078339e+01 | 5.661498e+00 | 0.0 | 1.093362e+02 | 7.722384e+02 | 6.506156e-01 |
| std | 1.521646e+03 | 1.521882e+03 | 5.005328e+01 | 5.237710e+01 | 0.0 | 7.023561e+01 | 5.816798e+02 | 2.017982e-01 |
| min | 1.013500e+04 | 1.013500e+04 | -6.300000e+01 | -9.400000e+01 | 0.0 | 4.000000e+00 | 3.100000e+01 | 3.000000e-01 |
| 25% | 1.129200e+04 | 1.129200e+04 | -6.000000e+00 | -1.500000e+01 | 0.0 | 5.900000e+01 | 3.440000e+02 | 4.800000e-01 |
| 50% | 1.288900e+04 | 1.288900e+04 | -2.000000e+00 | -6.000000e+00 | 0.0 | 9.100000e+01 | 6.120000e+02 | 6.500000e-01 |
| 75% | 1.405700e+04 | 1.405700e+04 | 7.000000e+00 | 8.000000e+00 | 0.0 | 1.390000e+02 | 1.013000e+03 | 8.200000e-01 |
| max | 1.621800e+04 | 1.621800e+04 | 2.941000e+03 | 2.923000e+03 | 0.0 | 1.557000e+03 | 4.983000e+03 | 1.000000e+00 |
import statistics
# Filling missing values with the median - REUSABLE CODE
def fill_missing_with_median(df, columns):
    """Replace NaNs in each listed column with that column's median.

    Mutates ``df`` in place. A column that is empty or entirely NaN is
    left untouched (the old list-based ``statistics.median`` approach
    raised StatisticsError in that case, and was O(n) Python-level work
    where ``Series.median()`` runs in C and skips NaNs for free).
    """
    for column in columns:
        median_of_column = df[column].median()
        if pd.notna(median_of_column):
            # Assign back instead of chained fillna(inplace=True), which
            # is deprecated and may silently operate on a copy.
            df[column] = df[column].fillna(median_of_column)
# Imputing missing values in columns with the median
impute_columns = ['DEP_DELAY', 'ARR_DELAY', 'AIR_TIME', 'DISTANCE']
fill_missing_with_median(Flights_df, impute_columns)
Flights_df.isnull().sum()
FL_DATE 0 OP_CARRIER 0 TAIL_NUM 0 OP_CARRIER_FL_NUM 0 ORIGIN_AIRPORT_ID 0 ORIGIN 0 ORIGIN_CITY_NAME 0 DEST_AIRPORT_ID 0 DESTINATION 0 DEST_CITY_NAME 0 DEP_DELAY 0 ARR_DELAY 0 CANCELLED 0 AIR_TIME 0 DISTANCE 0 OCCUPANCY_RATE 0 dtype: int64
Tickets_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1167285 entries, 0 to 1167284 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ITIN_ID 1167285 non-null int32 1 YEAR 1167285 non-null int32 2 QUARTER 1167285 non-null int32 3 ORIGIN 1167285 non-null object 4 ORIGIN_COUNTRY 1167285 non-null object 5 ORIGIN_STATE_ABR 1167285 non-null object 6 ORIGIN_STATE_NM 1167285 non-null object 7 ROUNDTRIP 1167285 non-null float32 8 REPORTING_CARRIER 1167285 non-null object 9 PASSENGERS 1165308 non-null float32 10 ITIN_FARE 1167285 non-null object 11 DESTINATION 1167285 non-null object dtypes: float32(2), int32(3), object(7) memory usage: 84.6+ MB
# ITIN_FARE was read as text; unparseable values become NaN.
Tickets_df['ITIN_FARE'] = pd.to_numeric(Tickets_df['ITIN_FARE'], errors='coerce')
# Keep round trips only (ROUNDTRIP == 1) and renumber the rows.
roundtrip_mask = Tickets_df["ROUNDTRIP"] == 1
Tickets_df = Tickets_df.loc[roundtrip_mask].reset_index(drop=True)
Tickets_df.duplicated().sum()
47564
# Dropping the 47564 duplicates with respect to all the columns
Tickets_df = Tickets_df.drop_duplicates()
# Checking the number of null values in each column
Tickets_df.isna().sum()
ITIN_ID 0 YEAR 0 QUARTER 0 ORIGIN 0 ORIGIN_COUNTRY 0 ORIGIN_STATE_ABR 0 ORIGIN_STATE_NM 0 ROUNDTRIP 0 REPORTING_CARRIER 0 PASSENGERS 960 ITIN_FARE 1657 DESTINATION 0 dtype: int64
Tickets_df.describe()
| ITIN_ID | YEAR | QUARTER | ROUNDTRIP | PASSENGERS | ITIN_FARE | |
|---|---|---|---|---|---|---|
| count | 6.610360e+05 | 661036.0 | 661036.0 | 661036.0 | 660076.000000 | 659379.000000 |
| mean | -9.078560e+07 | 2019.0 | 1.0 | 1.0 | 1.961235 | 473.245517 |
| std | 5.511631e+08 | 0.0 | 0.0 | 0.0 | 5.152424 | 344.206517 |
| min | -1.283736e+09 | 2019.0 | 1.0 | 1.0 | 1.000000 | 0.000000 |
| 25% | 4.820312e+07 | 2019.0 | 1.0 | 1.0 | 1.000000 | 280.000000 |
| 50% | 4.946054e+07 | 2019.0 | 1.0 | 1.0 | 1.000000 | 415.000000 |
| 75% | 5.082226e+07 | 2019.0 | 1.0 | 1.0 | 1.000000 | 595.000000 |
| max | 2.019200e+09 | 2019.0 | 1.0 | 1.0 | 681.000000 | 38400.000000 |
# Imputing missing values in columns with the median
impute_columns_tickets = ['PASSENGERS', 'ITIN_FARE']
fill_missing_with_median(Tickets_df, impute_columns_tickets)
To optimize merging, I am first aggregating the tickets and flights data by origin and destination, summarizing key metrics (e.g., mean ticket price, total flights). This reduces data size and complexity. Afterward, I'll merge these aggregated datasets with airport_codes, streamlining computations and improving efficiency.
Advantages of This Approach: Scalability: This method is well-suited for larger datasets, as it reduces memory overhead.
Readability: Aggregating at the source level simplifies subsequent analysis and avoids complex joins on larger datasets.
Flexibility: You can easily add or adjust the aggregate metrics without reworking the entire process.
# Collapse tickets to one row per (ORIGIN, DESTINATION) route.
# Use the string 'sum' rather than np.sum inside .agg(): passing the
# numpy function is deprecated in recent pandas releases.
Tickets_df = Tickets_df.groupby(['ORIGIN', 'DESTINATION']).agg(
    {'ORIGIN_STATE_NM': 'first',
     'PASSENGERS': 'sum',
     'ITIN_FARE': 'sum'}).reset_index()
## Canceled flights are already filtered out, but we’ll still count total flights per route for analysis
# CANCELLED is all zeros here, so 'count' simply yields flights per route.
# String aggregation names replace the deprecated np.sum-in-.agg() usage.
Flights_df = Flights_df.groupby(['ORIGIN', 'DESTINATION']).agg(
    {'ORIGIN_CITY_NAME': 'first',
     'DEST_CITY_NAME': 'first',
     'DEP_DELAY': 'sum',
     'ARR_DELAY': 'sum',
     'AIR_TIME': 'sum',
     'DISTANCE': 'sum',
     'OCCUPANCY_RATE': 'sum',
     'CANCELLED': 'count'}).reset_index()
Tickets_df
| ORIGIN | DESTINATION | ORIGIN_STATE_NM | PASSENGERS | ITIN_FARE | |
|---|---|---|---|---|---|
| 0 | ABE | ABQ | Pennsylvania | 2.0 | 1068.0 |
| 1 | ABE | AGS | Pennsylvania | 1.0 | 299.0 |
| 2 | ABE | AMA | Pennsylvania | 1.0 | 654.0 |
| 3 | ABE | ASE | Pennsylvania | 2.0 | 1484.0 |
| 4 | ABE | ATL | Pennsylvania | 56.0 | 25820.0 |
| ... | ... | ... | ... | ... | ... |
| 40285 | YUM | SMF | Arizona | 2.0 | 553.0 |
| 40286 | YUM | SRQ | Arizona | 1.0 | 696.0 |
| 40287 | YUM | TOL | Arizona | 1.0 | 335.0 |
| 40288 | YUM | TPA | Arizona | 3.0 | 1633.0 |
| 40289 | YUM | TUL | Arizona | 1.0 | 576.0 |
40290 rows × 5 columns
# Merge tickets with flights on origin and destination
final_df = Tickets_df.merge(Flights_df, on=['ORIGIN', 'DESTINATION'], suffixes=('_tickets', '_flights'))
# Merge with airport_codes for origin airport details
# (default inner join: routes whose origin code has no IATA match drop out)
final_df = final_df.merge(Airportcodes_df, left_on='ORIGIN', right_on='IATA_CODE', suffixes=('', '_origin'))
# Merge with airport_codes again for destination airport details
final_df = final_df.merge(Airportcodes_df, left_on='DESTINATION', right_on='IATA_CODE', suffixes=('_origin', '_destination'))
# Drop redundant columns (the two IATA_CODE copies duplicate ORIGIN/DESTINATION)
final_df.drop(columns=['IATA_CODE_origin', 'IATA_CODE_destination'], inplace=True)
# Rename column: after the groupby, CANCELLED holds the per-route flight count
final_df.rename(columns={'CANCELLED': 'No_of_Flights'}, inplace=True)
# Create route column for clarity
final_df['route'] = final_df['ORIGIN_CITY_NAME'] + ' TO ' + final_df['DEST_CITY_NAME']
# Create a copy of final data for safety
final_data = final_df.copy()
# Calculating total_delay (sum of per-flight delays across the route)
final_data['total_delay'] = final_data['ARR_DELAY'] + final_data['DEP_DELAY']
# Calculating average delay per flight on the route
final_data['AVG_delay'] = final_data['total_delay'] / final_data['No_of_Flights']
final_data['AVG_OCCUPANCY_RATE']=final_data['OCCUPANCY_RATE']/final_data['No_of_Flights']
final_data
| ORIGIN | DESTINATION | ORIGIN_STATE_NM | PASSENGERS | ITIN_FARE | ORIGIN_CITY_NAME | DEST_CITY_NAME | DEP_DELAY | ARR_DELAY | AIR_TIME | ... | OCCUPANCY_RATE | No_of_Flights | TYPE_origin | NAME_origin | TYPE_destination | NAME_destination | route | total_delay | AVG_delay | AVG_OCCUPANCY_RATE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ABE | ATL | Pennsylvania | 56.0 | 25820.0 | Allentown/Bethlehem/Easton, PA | Atlanta, GA | 1298.0 | 1185.0 | 23225.0 | ... | 139.220001 | 217 | medium_airport | Lehigh Valley International Airport | large_airport | Hartsfield Jackson Atlanta International Airport | Allentown/Bethlehem/Easton, PA TO Atlanta, GA | 2483.0 | 11.442396 | 0.641567 |
| 1 | ABQ | ATL | New Mexico | 62.0 | 25501.0 | Albuquerque, NM | Atlanta, GA | 511.0 | -1208.0 | 23977.0 | ... | 106.739998 | 165 | large_airport | Albuquerque International Sunport | large_airport | Hartsfield Jackson Atlanta International Airport | Albuquerque, NM TO Atlanta, GA | -697.0 | -4.224242 | 0.646909 |
| 2 | ABY | ATL | Georgia | 5.0 | 1729.0 | Albany, GA | Atlanta, GA | 1700.0 | -51.0 | 9281.0 | ... | 161.809998 | 249 | medium_airport | Southwest Georgia Regional Airport | large_airport | Hartsfield Jackson Atlanta International Airport | Albany, GA TO Atlanta, GA | 1649.0 | 6.622490 | 0.649839 |
| 3 | AEX | ATL | Louisiana | 16.0 | 5623.0 | Alexandria, LA | Atlanta, GA | 782.0 | -1843.0 | 23144.0 | ... | 205.389999 | 320 | medium_airport | Alexandria International Airport | large_airport | Hartsfield Jackson Atlanta International Airport | Alexandria, LA TO Atlanta, GA | -1061.0 | -3.315625 | 0.641844 |
| 4 | AGS | ATL | Georgia | 3.0 | 723.0 | Augusta, GA | Atlanta, GA | 5904.0 | 296.0 | 23421.0 | ... | 446.399994 | 683 | large_airport | Augusta Regional At Bush Field | large_airport | Hartsfield Jackson Atlanta International Airport | Augusta, GA TO Atlanta, GA | 6200.0 | 9.077599 | 0.653587 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5517 | SLC | CDC | Utah | 62.0 | 846.0 | Salt Lake City, UT | Cedar City, UT | 607.0 | 1177.0 | 5862.0 | ... | 100.320000 | 154 | large_airport | Salt Lake City International Airport | medium_airport | Cedar City Regional Airport | Salt Lake City, UT TO Cedar City, UT | 1784.0 | 11.584416 | 0.651429 |
| 5518 | SLC | EKO | Utah | 3.0 | 803.0 | Salt Lake City, UT | Elko, NV | 1419.0 | 489.0 | 5763.0 | ... | 104.669998 | 162 | large_airport | Salt Lake City International Airport | medium_airport | Elko Regional Airport | Salt Lake City, UT TO Elko, NV | 1908.0 | 11.777778 | 0.646111 |
| 5519 | SLC | LWS | Utah | 21.0 | 2078.0 | Salt Lake City, UT | Lewiston, ID | -323.0 | -1965.0 | 14113.0 | ... | 128.389999 | 199 | large_airport | Salt Lake City International Airport | medium_airport | Lewiston Nez Perce County Airport | Salt Lake City, UT TO Lewiston, ID | -2288.0 | -11.497487 | 0.645176 |
| 5520 | SLC | PIH | Utah | 3.0 | 573.0 | Salt Lake City, UT | Pocatello, ID | 940.0 | -1149.0 | 8439.0 | ... | 177.919998 | 270 | large_airport | Salt Lake City International Airport | medium_airport | Pocatello Regional Airport | Salt Lake City, UT TO Pocatello, ID | -209.0 | -0.774074 | 0.658963 |
| 5521 | SLC | TWF | Utah | 4.0 | 39179.0 | Salt Lake City, UT | Twin Falls, ID | 963.0 | -325.0 | 8244.0 | ... | 173.250000 | 257 | large_airport | Salt Lake City International Airport | medium_airport | Joslin Field Magic Valley Regional Airport | Salt Lake City, UT TO Twin Falls, ID | 638.0 | 2.482490 | 0.674125 |
5522 rows × 21 columns
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'
# Sort data to get top 10 routes with the highest occupancy rates
top_routes = final_data.sort_values(by='OCCUPANCY_RATE', ascending=False).head(10)
# Visualization
fig = px.bar(
    top_routes,
    x='route',
    y='OCCUPANCY_RATE',
    color='OCCUPANCY_RATE',
    color_continuous_scale='Viridis',
    text='OCCUPANCY_RATE',
    title='Top 10 Routes by Occupancy Rate',
    labels={'route': 'Route', 'OCCUPANCY_RATE': 'Occupancy Rate'},
    hover_data={'route': True, 'OCCUPANCY_RATE': True},
)
fig.update_traces(
    texttemplate='%{text:.2f}',
    textposition='outside',
    marker=dict(line=dict(color='black', width=1))
)
fig.update_layout(
    xaxis=dict(tickangle=45, showgrid=False),
    yaxis=dict(title='Occupancy Rate', showgrid=True),
    title_font_size=18,
    font=dict(size=14),
    height=700,
    plot_bgcolor='white',
)
# Render the chart (previously the figure was built but never displayed;
# every other chart cell ends with fig.show())
fig.show()
import plotly.express as px
import plotly.graph_objects as go
# Sort data to get top 20 routes with the highest number of flights
top_routes = final_data.sort_values(by='No_of_Flights', ascending=False).head(20)
# Create the bar chart
fig = px.bar(
top_routes,
x='route',
y='AVG_OCCUPANCY_RATE',
text='AVG_OCCUPANCY_RATE',
title='Top 20 Routes by Occupancy Rate',
labels={'AVG_OCCUPANCY_RATE': 'Average Occupancy Rate', 'route': 'Route'},
hover_data={'route': True, 'AVG_OCCUPANCY_RATE': True},
)
fig.add_trace(
go.Scatter(
x=top_routes['route'],
y=top_routes['No_of_Flights'],
mode='lines+markers',
name='Number of Flights',
yaxis='y2',
line=dict(color='red', width=2)
)
)
fig.update_traces(texttemplate='%{text:.2f}')
fig.update_layout(
xaxis=dict(tickangle=45),
yaxis=dict(title='Average Occupancy Rate'),
yaxis2=dict(
title='Number of Flights',
overlaying='y',
side='right',
),
xaxis_title='Route',
title_font_size=16,
font=dict(size=12),
height=650,
)
# Visualization
fig.show()
final_data=final_data[['route','PASSENGERS','ITIN_FARE','DEP_DELAY','ARR_DELAY','DISTANCE','OCCUPANCY_RATE','No_of_Flights','TYPE_origin','TYPE_destination', 'AVG_delay', 'AVG_OCCUPANCY_RATE', 'total_delay']]
## All calculations.
# NOTE(review): PASSENGERS and ITIN_FARE are per-route SUMS after the
# groupby, so their product may overstate revenue if ITIN_FARE already
# represents total itinerary fares — confirm the intended revenue model.
final_data['total_fare']=final_data['PASSENGERS']*final_data['ITIN_FARE']
# Baggage revenue: assumes $70 per checked bag, half the seats check one
final_data['baggage_cost']=final_data['OCCUPANCY_RATE']*70*0.5
# Delay costs priced at $75 per delay minute (arrival and departure)
final_data['arr_delay_cost']=final_data['ARR_DELAY']*75
final_data['dep_delay_cost']=final_data['DEP_DELAY']*75
# Airport fees per flight: $5,000 at medium airports, $10,000 otherwise,
# charged once at the origin and once at the destination
final_data['airport_cost']=np.where((final_data['TYPE_origin']=='medium_airport'),final_data['No_of_Flights']*5000 , final_data['No_of_Flights']*10000)
final_data['airport_cost']=np.where((final_data['TYPE_destination']=='medium_airport'),final_data['airport_cost']+final_data['No_of_Flights']*5000 , final_data['airport_cost']+final_data['No_of_Flights']*10000)
# Fuel/crew/etc. priced at $9.18 per mile flown
final_data['essentials_cost']=final_data['DISTANCE']*9.18
final_data['income']=final_data['total_fare']+final_data['baggage_cost']
final_data['cost']=final_data['arr_delay_cost']+final_data['dep_delay_cost']+final_data['airport_cost']+final_data['essentials_cost']
# Calculate profit
final_data['profit'] = final_data['income'] - final_data['cost']
# Sort data to get top 10 routes with the highest profit
top_routes_profit = final_data.sort_values(by='profit', ascending=False).head(10)
# Create the interactive bar chart
fig = px.bar(
top_routes_profit,
x='route',
y='profit',
text='profit',
title='Top 10 Routes by Profit',
labels={'profit': 'Profit', 'route': 'Route'},
hover_data={'route': True, 'profit': True},
)
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig.update_layout(
xaxis=dict(tickangle=45),
yaxis=dict(title='Profit'),
xaxis_title='Route',
title_font_size=16,
font=dict(size=12),
height=650,
)
# Visualization
fig.show()
# Calculate profit per trip
final_data['profit_per_trip'] = final_data['profit'] / final_data['No_of_Flights']
# Select top 10 routes by profit per trip
top_routes = final_data.sort_values(by='profit_per_trip', ascending=False).head(10)
# Create an interactive bar chart
fig = px.bar(
top_routes,
x='route',
y='profit_per_trip',
text='profit_per_trip',
title='Top 10 Routes by Profit per Trip',
labels={'profit_per_trip': 'Profit per Trip', 'route': 'Route'},
hover_data={'route': True, 'profit_per_trip': True},
color='profit_per_trip',
color_continuous_scale='viridis'
)
# Customize the layout
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig.update_layout(
xaxis=dict(tickangle=45, title='Route'),
yaxis=dict(title='Profit per Trip'),
title_font_size=16,
font=dict(size=12),
height=650,
)
# Visualization
fig.show()
NOTES
To gain a clearer understanding of our market strategy and revenue potential, we need to refine our analysis. While identifying the busiest round trip routes provides a vivid snapshot, it's crucial to align our focus with profitable opportunities.
Let’s strengthen our analysis by classifying airports based on their level of operational activity. This involves identifying airports that qualify as 'busy' to focus our efforts on maximizing customer engagement and driving revenue growth.
It is also crucial to take the route into consideration, so the analysis gives a clear understanding of which routes are more occupied and are likely to exceed the threshold for profitability consideration.
Operating a fleet of 5 aircraft presents a fantastic opportunity to establish a highly efficient and flexible aviation operation, whether you're focused on passenger transportation, cargo delivery, or charter services.
Airports with a number of flights exceeding 600 are considered having busy routes, indicating significant operational activity.
This approach integrates operational capacity, profitability considerations, and seasonal adjustments to effectively identify airports with substantial flight activity, aligning with strategic decision-making in aviation operations.
# Flag routes at or above the 600-total-flights threshold as "busy"
# (1 = busy, 0 = not busy).
final_data['busy_route'] = (final_data['No_of_Flights'] >= 600).astype(int)
import plotly.express as px

# Restrict to routes flagged as busy, then keep the ten with the highest
# profit per round trip.
top_profitable_routes = (
    final_data
    .loc[final_data['busy_route'] == 1]
    .sort_values('profit_per_trip', ascending=False)
    .head(10)
)

# Interactive bar chart: one bar per route, colored by profitability.
fig = px.bar(
    top_profitable_routes,
    x='route',
    y='profit_per_trip',
    text='profit_per_trip',
    color='profit_per_trip',
    color_continuous_scale='Viridis',
    title='Top 10 Most Profitable Routes per Round Trip',
    labels={'route': 'Route', 'profit_per_trip': 'Profit per Trip'},
    hover_data={'route': True, 'profit_per_trip': True},
)

# Two-decimal labels above the bars; slanted ticks keep route names legible.
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig.update_layout(
    height=650,
    font=dict(size=12),
    title_font_size=16,
    xaxis_tickangle=45,
    xaxis_title='Route',
    yaxis_title='Profit per Trip',
)
# Render the interactive figure.
fig.show()
# Combined (departure + arrival) delay per route, in minutes.
final_data['total_delay'] = final_data['DEP_DELAY'].add(final_data['ARR_DELAY'])

# Among the busy routes, keep the ten with the smallest combined delay.
busiest_routes = final_data[final_data['busy_route'] == 1]
top_routes_delay = busiest_routes.sort_values('total_delay').head(10)

# Interactive bar chart of total delay for the ten most punctual busy routes.
fig = px.bar(
    top_routes_delay,
    x='route',
    y='total_delay',
    text='total_delay',
    color='total_delay',
    color_continuous_scale='Blues',
    title='Top 10 Routes with Lowest Total Delay',
    labels={'route': 'Route', 'total_delay': 'Total Delay (minutes)'},
    hover_data={'route': True, 'total_delay': True},
)

# Whole-minute labels outside the bars; slanted ticks for readability.
fig.update_traces(texttemplate='%{text:.0f}', textposition='outside')
fig.update_layout(
    height=650,
    font=dict(size=12),
    title_font_size=16,
    xaxis_tickangle=45,
    xaxis_title='Route',
    yaxis_title='Total Delay (minutes)',
)
# Render the interactive figure.
fig.show()
final_data
| route | PASSENGERS | ITIN_FARE | DEP_DELAY | ARR_DELAY | DISTANCE | OCCUPANCY_RATE | No_of_Flights | TYPE_origin | TYPE_destination | ... | baggage_cost | arr_delay_cost | dep_delay_cost | airport_cost | essentials_cost | income | cost | profit | profit_per_trip | busy_route | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Allentown/Bethlehem/Easton, PA TO Atlanta, GA | 56.0 | 25820.0 | 1298.0 | 1185.0 | 150164.0 | 139.220001 | 217 | medium_airport | large_airport | ... | 4872.700195 | 88875.0 | 97350.0 | 3255000 | 1378505.52 | 1.450793e+06 | 4819730.52 | -3.368938e+06 | -15525.059077 | 0 |
| 1 | Albuquerque, NM TO Atlanta, GA | 62.0 | 25501.0 | 511.0 | -1208.0 | 209385.0 | 106.739998 | 165 | large_airport | large_airport | ... | 3735.899902 | -90600.0 | 38325.0 | 3300000 | 1922154.30 | 1.584798e+06 | 5169879.30 | -3.585081e+06 | -21727.766061 | 0 |
| 2 | Albany, GA TO Atlanta, GA | 5.0 | 1729.0 | 1700.0 | -51.0 | 36105.0 | 161.809998 | 249 | medium_airport | large_airport | ... | 5663.350098 | -3825.0 | 127500.0 | 3735000 | 331443.90 | 1.430835e+04 | 4190118.90 | -4.175811e+06 | -16770.323494 | 0 |
| 3 | Alexandria, LA TO Atlanta, GA | 16.0 | 5623.0 | 782.0 | -1843.0 | 160000.0 | 205.389999 | 320 | medium_airport | large_airport | ... | 7188.649902 | -138225.0 | 58650.0 | 4800000 | 1468800.00 | 9.715665e+04 | 6189225.00 | -6.092068e+06 | -19037.713594 | 0 |
| 4 | Augusta, GA TO Atlanta, GA | 3.0 | 723.0 | 5904.0 | 296.0 | 97669.0 | 446.399994 | 683 | large_airport | large_airport | ... | 15624.000000 | 22200.0 | 442800.0 | 13660000 | 896601.42 | 1.779300e+04 | 15021601.42 | -1.500381e+07 | -21967.508668 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5517 | Salt Lake City, UT TO Cedar City, UT | 62.0 | 846.0 | 607.0 | 1177.0 | 34034.0 | 100.320000 | 154 | large_airport | medium_airport | ... | 3511.199951 | 88275.0 | 45525.0 | 2310000 | 312432.12 | 5.596320e+04 | 2756232.12 | -2.700269e+06 | -17534.213767 | 0 |
| 5518 | Salt Lake City, UT TO Elko, NV | 3.0 | 803.0 | 1419.0 | 489.0 | 32400.0 | 104.669998 | 162 | large_airport | medium_airport | ... | 3663.449951 | 36675.0 | 106425.0 | 2430000 | 297432.00 | 6.072450e+03 | 2870532.00 | -2.864460e+06 | -17681.849074 | 0 |
| 5519 | Salt Lake City, UT TO Lewiston, ID | 21.0 | 2078.0 | -323.0 | -1965.0 | 91739.0 | 128.389999 | 199 | large_airport | medium_airport | ... | 4493.649902 | -147375.0 | -24225.0 | 2985000 | 842164.02 | 4.813165e+04 | 3655564.02 | -3.607432e+06 | -18127.800855 | 0 |
| 5520 | Salt Lake City, UT TO Pocatello, ID | 3.0 | 573.0 | 940.0 | -1149.0 | 40500.0 | 177.919998 | 270 | large_airport | medium_airport | ... | 6227.199707 | -86175.0 | 70500.0 | 4050000 | 371790.00 | 7.946200e+03 | 4406115.00 | -4.398169e+06 | -16289.514075 | 0 |
| 5521 | Salt Lake City, UT TO Twin Falls, ID | 4.0 | 39179.0 | 963.0 | -325.0 | 44975.0 | 173.250000 | 257 | large_airport | medium_airport | ... | 6063.750000 | -24375.0 | 72225.0 | 3855000 | 412870.50 | 1.627798e+05 | 4315720.50 | -4.152941e+06 | -16159.302529 | 0 |
5522 rows × 24 columns
import plotly.express as px
import plotly.graph_objects as go

# Rank the busy routes by traffic and keep the ten busiest; the chart
# plots their average delay (bars) against flight volume (line, right axis).
busiest_routes = final_data[final_data['busy_route'] == 1]
top_routes_delay = busiest_routes.sort_values('No_of_Flights', ascending=False).head(10)

# Bar layer: average delay per route.
fig = px.bar(
    top_routes_delay,
    x='route',
    y='AVG_delay',
    text='AVG_delay',
    color='AVG_delay',
    color_continuous_scale='Blues',
    title='Top 10 Routes with Average Delay',
    labels={'route': 'Route', 'AVG_delay': 'AVG Delay (minutes)'},
    hover_data={'route': True, 'AVG_delay': True},
)

# Line layer: flight counts, overlaid on a secondary y-axis.
flight_volume = go.Scatter(
    x=top_routes_delay['route'],
    y=top_routes_delay['No_of_Flights'],
    mode='lines+markers',
    name='Number of Flights',
    yaxis='y2',
    line=dict(color='red', width=2),
)
fig.add_trace(flight_volume)

# Whole-number text labels (applied across traces, as in the original order:
# add_trace first, then update_traces).
fig.update_traces(texttemplate='%{text:.0f}')
fig.update_layout(
    xaxis=dict(tickangle=45, title='Route'),
    yaxis=dict(title='AVG Delay (minutes)'),
    yaxis2=dict(
        title='Number of Flights',
        overlaying='y',
        side='right',
    ),
    title_font_size=16,
    font=dict(size=12),
    height=650,
)
# Render the interactive figure.
fig.show()
To achieve company goals, we aim to maximize profitability through strategic initiatives, expand our customer base, and maintain operational excellence with a focus on punctuality. A unified scoring system will evaluate performance using normalized metrics for profitability, total flights, and delays. Each metric is scaled to the range 0 to 1 and combined into a weighted average (weighted 0.33 each for delay, total flights, and profitability) to assess overall operational effectiveness and alignment with strategic objectives.
from sklearn.preprocessing import MinMaxScaler

# Score every busy route on a common 0-1 scale across delay, profitability
# and traffic so the three metrics can be combined into one ranking.
min_max_scaler = MinMaxScaler()

# .copy() detaches the filtered rows from final_data so the column
# assignments below write to an independent frame instead of a view
# (avoids pandas' SettingWithCopyWarning and silent lost writes).
busy_routes_data = final_data[final_data['busy_route'] == 1].copy()

# Min-max scale each metric into [0, 1].
columns_to_scale = ['total_delay', 'profit_per_trip', 'No_of_Flights']
scaled_data = min_max_scaler.fit_transform(busy_routes_data[columns_to_scale])

# Delay is a "lower is better" metric, so flip its scale (1 - scaled);
# profit per trip and flight count are "higher is better" and kept as-is.
busy_routes_data['delay_scaled'] = 1 - scaled_data[:, 0]
busy_routes_data['profit_per_trip_scaled'] = scaled_data[:, 1]
busy_routes_data['No_of_Flights_scaled'] = scaled_data[:, 2]

# Equal-weight composite score. NOTE(review): the weights sum to 0.99, so
# the maximum attainable score is 0.99 rather than 1.0, and the narrative
# elsewhere mentions a 40/30/30 split — confirm which weighting is
# intended before changing it (printed results were produced with 0.33).
busy_routes_data['score'] = 0.33 * busy_routes_data['delay_scaled'] + \
                            0.33 * busy_routes_data['profit_per_trip_scaled'] + \
                            0.33 * busy_routes_data['No_of_Flights_scaled']
import plotly.express as px

# Ten best routes by the composite score computed above.
top_routes = busy_routes_data.sort_values('score', ascending=False).head(10)

# Interactive bar chart of the composite score, colored by score.
fig = px.bar(
    top_routes,
    x='route',
    y='score',
    color='score',
    color_continuous_scale='Viridis',
    title='Top 10 Routes by Score',
    labels={'route': 'Route', 'score': 'Score'},
    hover_data={'route': True, 'score': True},
)
fig.update_layout(
    height=650,
    font=dict(size=12),
    title_font_size=16,
    xaxis_tickangle=60,
    xaxis_title='Route',
    yaxis_title='Score',
)
# Render the interactive figure.
fig.show()
Recommended Routes:
New York, NY to Las Vegas, NV; New York, NY to Los Angeles, CA; Minneapolis, MN to Orlando, FL; Los Angeles, CA to New York, NY; New York, NY to San Francisco, CA
Rationale:
These routes are recommended based on a comprehensive weighted-average analysis of KPIs including the total number of flights (No_of_Flights_scaled), profitability per trip (profit_per_trip_scaled), and delay times (delay_scaled).
By analyzing these metrics collectively, the company can adopt a well-rounded strategy that enhances both profitability and operational efficiency. Focusing on routes with strong performance across these metrics will help the company uphold its reputation for punctuality while driving financial success.
import plotly.express as px

# Final shortlist: the five highest-scoring routes.
top_routes = busy_routes_data.sort_values('score', ascending=False).head(5)

# Simple bar chart with the raw score printed above each bar.
fig = px.bar(
    top_routes,
    x='route',
    y='score',
    text='score',
    title='Top 5 Recommended Routes',
    labels={'route': 'Route', 'score': 'Score'},
)
fig.update_traces(texttemplate='%{text}', textposition='outside', hoverinfo='x+y+text')
# Render the interactive figure.
fig.show()
# Round trips each recommended route needs to recover the upfront
# investment. NOTE(review): 90,000,000 is presumably the total upfront
# cost of the 5-aircraft fleet — confirm the source of this figure.
top_routes = top_routes.assign(
    NUMBER_OF_TRIPS_TO_BREAKEVEN=lambda x: 90000000 / x['profit_per_trip']
)
# The breakeven column is already in top_routes, so export it directly
# rather than recomputing the identical assignment a second time.
top_routes.to_excel('Final_Solution.xlsx')
print(top_routes)
route PASSENGERS ITIN_FARE DEP_DELAY \
1585 New York, NY TO Los Angeles, CA 3580.0 833288.0 15822.0
1399 Los Angeles, CA TO New York, NY 2265.0 670204.0 38189.0
1739 Minneapolis, MN TO Orlando, FL 2529.0 272637.0 7124.0
1485 New York, NY TO Las Vegas, NV 1646.0 438812.0 5260.0
2320 New York, NY TO San Francisco, CA 1803.0 778869.0 26847.0
ARR_DELAY DISTANCE OCCUPANCY_RATE No_of_Flights TYPE_origin \
1585 -332.0 7816050.0 2042.729980 3158 large_airport
1399 -11701.0 7825950.0 2065.739990 3162 large_airport
1739 2161.0 822680.0 410.540009 628 large_airport
1485 -891.0 1958008.0 563.650024 871 large_airport
2320 15346.0 4809960.0 1218.260010 1860 large_airport
TYPE_destination ... income cost profit \
1585 large_airport ... 2.983243e+09 1.360731e+08 2.847169e+09
1399 large_airport ... 1.518084e+09 1.370688e+08 1.381016e+09
1739 large_airport ... 6.895133e+08 2.080858e+07 6.687048e+08
1485 large_airport ... 7.223043e+08 3.572219e+07 6.865821e+08
2320 large_airport ... 1.404343e+09 8.451991e+07 1.319824e+09
profit_per_trip busy_route delay_scaled profit_per_trip_scaled \
1585 9.015736e+05 1 0.858725 0.851698
1399 4.367538e+05 1 0.791528 0.429421
1739 1.064817e+06 1 0.896638 1.000000
1485 7.882688e+05 1 0.926674 0.748763
2320 7.095825e+05 1 0.695571 0.677279
No_of_Flights_scaled score NUMBER_OF_TRIPS_TO_BREAKEVEN
1585 0.715324 0.800497 99.825460
1399 0.716443 0.639339 206.065748
1739 0.007830 0.628474 84.521605
1485 0.075783 0.577903 114.174257
2320 0.352349 0.569316 126.835138
[5 rows x 29 columns]
top_routes[['ITIN_FARE','DEP_DELAY','ARR_DELAY','No_of_Flights','cost','profit','total_delay','NUMBER_OF_TRIPS_TO_BREAKEVEN']]
| ITIN_FARE | DEP_DELAY | ARR_DELAY | No_of_Flights | cost | profit | total_delay | NUMBER_OF_TRIPS_TO_BREAKEVEN | |
|---|---|---|---|---|---|---|---|---|
| 1585 | 833288.0 | 15822.0 | -332.0 | 3158 | 1.360731e+08 | 2.847169e+09 | 15490.0 | 99.825460 |
| 1399 | 670204.0 | 38189.0 | -11701.0 | 3162 | 1.370688e+08 | 1.381016e+09 | 26488.0 | 206.065748 |
| 1739 | 272637.0 | 7124.0 | 2161.0 | 628 | 2.080858e+07 | 6.687048e+08 | 9285.0 | 84.521605 |
| 1485 | 438812.0 | 5260.0 | -891.0 | 871 | 3.572219e+07 | 6.865821e+08 | 4369.0 | 114.174257 |
| 2320 | 778869.0 | 26847.0 | 15346.0 | 1860 | 8.451991e+07 | 1.319824e+09 | 42193.0 | 126.835138 |
Essential KPIs that are already given to us are:
Distance Fare Arrival Delay Departure Delay Occupancy Rate
Additional Key Performance Indicators (KPIs) for Airline Analysis:
Demographic insights optimize marketing and service customization.
Running an analysis on the pricing of Economy, Basic Economy, and Business fares can help identify optimal ticket pricing strategies. By adjusting fares based on passenger demand and competition, we can attract more passengers while maximizing revenue across different fare classes.
Evaluating additional services such as meals, luxury products, drinks, travel perks, and discounts can enhance customer satisfaction and drive revenue. Tracking their adoption, costs, and impact on repeat business helps optimize profitability and loyalty.
Weather analysis enhances schedule reliability and passenger satisfaction.
Government Policies supporting tourism and business drive route expansion opportunities.
Tracking the overall onboarding experience can be a key parameter for customer satisfaction and for encouraging customers to return for the same flight experience when travelling.
These KPIs enable comprehensive analysis, supporting strategic decisions to optimize operations, revenue, and customer satisfaction in the airline industry.